rm(list=ls())
library(data.table)
library(ggplot2)
library(stargazer)
library(viridis)  
library(patchwork)
library(scales)
library(fixest)
library(etable)
library(usmap)
library(dplyr)


JS_wide <- fread(file="JS_wide.csv")

df <- as.data.frame(JS_wide)

jobsat_monthly <- JS_wide[!is.na(jobsat), 
                  .(jobsat = mean(jobsat, na.rm = TRUE), n_counties  = uniqueN(StateCounty)),
                  by = .(year, month)][order(year, month)]
jobsat_monthly[,yearmonth := as.IDate(sprintf("%d-%02d-01", year, month))]
max_n <- max(jobsat_monthly$n_counties, na.rm = TRUE)
jobsat_monthly[,jobsat_rescaled := jobsat * (n_counties / max_n)]


wforce_monthly <- JS_wide[,.(wforce = sum(labor_force, na.rm = TRUE)),
  by = .(year, month)][order(year, month)]
wforce_monthly[,yearmonth := as.IDate(sprintf("%d-%02d-01", year, month))]

unemp_monthly <- JS_wide[,.(
    unemp = weighted.mean(unemployment_rate,w = labor_force,na.rm = TRUE)),
  by = .(year, month)][order(year, month)]
unemp_monthly[,yearmonth := as.IDate(sprintf("%d-%02d-01", year, month))]

compare <- merge(unemp_monthly[, .(yearmonth, unemp)],
                 jobsat_monthly[, .(yearmonth, jobsat, jobsat_rescaled)],
                 by = "yearmonth")

compare <- merge(compare[, .(yearmonth, unemp, jobsat, jobsat_rescaled)],
                 wforce_monthly[, .(yearmonth, wforce)],
                 by = "yearmonth")

round(cor(compare[,c("jobsat", "jobsat_rescaled","wforce", "unemp")]),2)



compare[, date := as.Date(yearmonth)]

# Plot 1: Unemployment
p1 <- ggplot(compare, aes(x = date, y = unemp)) +
  geom_line(color = "steelblue", size = 1) +
  labs(x = NULL, y = "Unempl. rate (%)") +
  theme_minimal(base_size = 13)

# Plot 2: Labor force (millions)
p2 <- ggplot(compare, aes(x = date, y = wforce)) +
  geom_line(color = "darkred", size = 1) +
  scale_y_continuous(
    labels = label_number(scale = 1e-6, accuracy = 1),
    name = "Labor force (m)"
  ) +
  labs(x = NULL) +
  theme_minimal(base_size = 13)

# Plot 3: the rescaled job indicator
p3 <- ggplot(compare, aes(x = date, y = jobsat_rescaled)) +
  geom_line(color = "forestgreen", size = 1) +
  labs(x = NULL, y = "Job-satisfaction") +
  theme_minimal(base_size = 13)

p0 <- p1 / p2 / p3
p0

ggsave(p0, file="unemp_lforce.pdf", width=9, height=6)

# LOGIT modeling
mod1 <- glm(
  I(jobsat > 0) ~ unemployment_rate  +I(labor_force/1000)+
    factor(year) + factor(month) + validtweets_jobsat,
  family = "binomial",
  data = df
)

mod2 <- glm(
  I(jobsat > 0) ~ unemployment_rate + I(labor_force/1000) + I(acs5/1000) +
    factor(year) + factor(month) + validtweets_jobsat,
  family = "binomial",
  data = df
)

mod3 <- glm(
  I(jobsat > 0) ~ unemployment_rate * I(labor_force/1000) + I(acs5/1000) + 
    factor(year) + factor(month)+ validtweets_jobsat,
  family = "binomial",
  data = df
)

mod4 <- glm(
  I(jobsat > 0) ~ unemployment_rate * I(labor_force/1000) + I(acs5/1000) + rural +
    factor(year) + factor(month) + factor(State) + validtweets_jobsat,
  family = "binomial",
  data = df
)


vars <- c("employed", "labor_force", "unemployed", "unemployment_rate",
          "rural", "population", "validtweets_jobsat", "acs5")

cors <- round(cor(df[, c("jobsat", vars)], use = "pair")["jobsat", ], 2)
cors <- cors[-1]
tab <- rbind(
  Variable = vars,
  Correlation = as.character(cors)
)

tab
xtable::xtable(tab)

stargazer(
  list(mod1,mod2,mod3, mod4),
  type = "latex",  # 
  title = "Logit model for P(jobsat > 0)",
  dep.var.labels = "P(jobsat > 0)",
  covariate.labels = c(
    "Unemployment rate",
    "Labor force (thousands persons)",
    "ACS5 income (thousands USD)",
    "Rurality",
    "Valid jobsat tweets"
  ),
  omit = c("factor\\(year\\)", "factor\\(month\\)", "factor\\(State\\)"),
  no.space = TRUE,
  digits = 3,
  notes = c(
    "Year and month coefficients omitted.",
    "Model (4) includes fixed effects for State."
  ),
  notes.align = "r"   # left-align notes
)


# This version is needed to calculate the clustered SE, point estimates are the same

## Model 1: unemployment + labor force + FE(year, month)
mod1_fe <- feglm(
  I(jobsat > 0) ~ unemployment_rate + I(labor_force/1000) + validtweets_jobsat |
    year + month,
  family = "binomial",
  data   = df
)

## Model 2: add ACS5 income
mod2_fe <- feglm(
  I(jobsat > 0) ~ unemployment_rate + I(labor_force/1000) + I(acs5/1000) +
    validtweets_jobsat | 
    year + month,
  family = "binomial",
  data   = df
)

## Model 3: add interaction unemployment × labor force
mod3_fe <- feglm(
  I(jobsat > 0) ~ unemployment_rate * I(labor_force/1000) + I(acs5/1000) +
    validtweets_jobsat | 
    year + month,
  family = "binomial",
  data   = df
)

## Model 4: add rurality + State FE
mod4_fe <- feglm(
  I(jobsat > 0) ~ unemployment_rate * I(labor_force/1000) + I(acs5/1000) +
    rural + validtweets_jobsat | 
    year + month + State,
  family = "binomial",
  data   = df
)

etable(
  list(
    "Model (1)" = mod1_fe,
    "Model (2)" = mod2_fe,
    "Model (3)" = mod3_fe,
    "Model (4)" = mod4_fe
  ),
  vcov   = ~ StateCounty,   # clustered SE at county level
  tex    = TRUE,            # LaTeX code
  dict   = c(               # rename coefficients
    "unemployment_rate"                    = "Unemployment rate",
    "I(labor_force/1000)"                  = "Labor force (thousands persons)",
    "I(acs5/1000)"                         = "ACS5 income (thousands USD)",
    "rural"                                = "Rurality",
    "validtweets_jobsat"                   = "Valid jobsat tweets",
    "unemployment_rate:I(labor_force/1000)" =
      "Unemployment rate $\\times$ Labor force (thousands persons)"
  ),
  fitstat = c("n", "pr2",  "sq.cor" , "aic", "bic")   # N, logLik, AIC like in your current table
)


p     <- predict(mod4, type = "response")
d     <- p * (1 - p)
d_bar <- mean(d, na.rm = TRUE)

mf <- model.frame(mod4)
u  <- mf[["unemployment_rate"]]
lf <- mf[["I(labor_force/1000)"]]
V <- sandwich::vcovCL(mod4, cluster = df$StateCounty)  # o ~ StateCounty se vuoi formula
b <- coef(mod4)

vars_main <- c("unemployment_rate",
               "I(labor_force/1000)",
               "I(acs5/1000)",
               "rural",
               "validtweets_jobsat")

int_term <- "unemployment_rate:I(labor_force/1000)"

## Calculating AME and significance based on clustered SE
AME    <- setNames(numeric(length(vars_main) + 1),
                   c(vars_main, int_term))
AME_se <- setNames(numeric(length(vars_main) + 1),
                   c(vars_main, int_term))

## (a) Unemployment: ∂P/∂u = d_i * (β_u + β_int * lf_i)
a1_u <- mean(d, na.rm = TRUE)
a2_u <- mean(d * lf, na.rm = TRUE)

AME["unemployment_rate"] <-
  a1_u * b["unemployment_rate"] + a2_u * b[int_term]

Vu <- V[c("unemployment_rate", int_term),
        c("unemployment_rate", int_term), drop = FALSE]

g_u   <- c(a1_u, a2_u)
var_u <- as.numeric(t(g_u) %*% Vu %*% g_u)
AME_se["unemployment_rate"] <- sqrt(var_u)

## (b) Labor force: ∂P/∂lf = d_i * (β_lf + β_int * u_i)
a1_lf <- mean(d, na.rm = TRUE)
a2_lf <- mean(d * u, na.rm = TRUE)

AME["I(labor_force/1000)"] <-
  a1_lf * b["I(labor_force/1000)"] + a2_lf * b[int_term]

Vlf <- V[c("I(labor_force/1000)", int_term),
         c("I(labor_force/1000)", int_term), drop = FALSE]

g_lf   <- c(a1_lf, a2_lf)
var_lf <- as.numeric(t(g_lf) %*% Vlf %*% g_lf)
AME_se["I(labor_force/1000)"] <- sqrt(var_lf)

## (c) Variabili senza interazioni: ∂P/∂x_k = d̄ * β_k
vars_simple <- setdiff(vars_main,
                       c("unemployment_rate", "I(labor_force/1000)"))

for (v in vars_simple) {
  AME[v]    <- d_bar * b[v]
  AME_se[v] <- abs(d_bar) * sqrt(V[v, v])
}

## (d) Interaction (cross-partial)
AME[int_term]    <- d_bar * b[int_term]
AME_se[int_term] <- abs(d_bar) * sqrt(V[int_term, int_term])

## z, p
z_val <- AME / AME_se
p_val <- 2 * pnorm(-abs(z_val))

lab_map <- setNames(
  c("Unemployment rate",
    "Labor force (thousands persons)",
    "ACS5 income (thousands USD)",
    "Rurality",
    "Valid jobsat tweets",
    "Unemp. × labor force (interaction)"),
  c("unemployment_rate",
    "I(labor_force/1000)",
    "I(acs5/1000)",
    "rural",
    "validtweets_jobsat",
    "unemployment_rate:I(labor_force/1000)")
)

vars_all <- c(vars_main, int_term)

AME_tab <- data.frame(
  Variable = unname(lab_map[vars_all]),
  AME      = as.numeric(AME[vars_all]),
  SE       = as.numeric(AME_se[vars_all]),
  z        = as.numeric(z_val[vars_all]),
  p_value  = as.numeric(p_val[vars_all]),
  row.names = NULL
)

AME_tab_print <- within(AME_tab, {
  AME     <- round(AME, 5)
  SE      <- round(SE, 5)
  z       <- round(z, 2)
  p_value <- signif(p_value, 5)
})

AME_tab_print

stargazer(
  AME_tab_print,
  summary   = FALSE,
  rownames  = FALSE,
  type      = "latex",
  title     = "Average marginal effects from the logit model for $P(\\texttt{jobsat} > 0)$",
  label     = "tab:ame_jobsat",
  digits    = 4
)


AME_tab_tex <- AME_tab
AME_tab_tex$AME     <- sprintf("%.5f", AME_tab_tex$AME)   # like R print
AME_tab_tex$SE      <- sprintf("%.5f", AME_tab_tex$SE)
AME_tab_tex$z       <- sprintf("%.2f",  AME_tab_tex$z)
AME_tab_tex$p_value <- sprintf("%.4g",  AME_tab_tex$p_value)  # scientific if needed

colnames(AME_tab_tex)[colnames(AME_tab_tex) == "p_value"] <- "$p$-value"

AME_tab_tex

stargazer(
  AME_tab_tex,
  summary   = FALSE,
  rownames  = FALSE,
  type      = "latex",
  title     = "Average marginal effects from the logit model for $P(\\texttt{jobsat} > 0)$",
  label     = "tab:ame_jobsat",
  digits    = 5
)

### OLS Modeling


mod1_OLS <- lm(
  jobsat  ~ unemployment_rate  +I(labor_force/1000)+
    factor(year) + factor(month) + validtweets_jobsat,
  data = df
)

mod2_OLS <- lm(
  jobsat  ~ unemployment_rate + I(labor_force/1000) + I(acs5/1000) +
    factor(year) + factor(month) + validtweets_jobsat,
  data = df
)

mod3_OLS <- lm(
  jobsat  ~ unemployment_rate * I(labor_force/1000) + I(acs5/1000) + 
    factor(year) + factor(month)+ validtweets_jobsat,
  data = df
)


mod4_OLS <- lm(
  jobsat  ~ unemployment_rate * I(labor_force/1000) + I(acs5/1000) + rural +
    factor(year) + factor(month) + factor(State) + validtweets_jobsat,
  data = df
)




stargazer(
  list(mod1_OLS,mod2_OLS,mod3_OLS, mod4_OLS),
  type = "latex",  # "latex" or "html" for other outputs
  title = "OLS model for jobsat",
  dep.var.labels = "jobsat",
  covariate.labels = c(
    "Unemployment rate",
    "Labor force (thousands persons)",
    "ACS5 income (thousands USD)",
    "Rurality",
    "Valid jobsat tweets"
  ),
  omit = c("factor\\(year\\)", "factor\\(month\\)", "factor\\(State\\)"),
  no.space = TRUE,
  digits = 5,
  notes = c(
    "Year and month coefficients omitted.",
    "Model (4) includes fixed effects for State."
  ),
  notes.align = "r",   # left-align notes
 keep.stat = c("n", "rsq", "adj.rsq")
)


mods_OLS <- list(mod1_OLS, mod2_OLS, mod3_OLS, mod4_OLS)

# clusterized SE
vcov_clus <- lapply(
  mods_OLS,
  function(m) sandwich::vcovCL(m, cluster = ~ StateCounty, type = "HC1")
)

se_clus <- lapply(vcov_clus, function(V) sqrt(diag(V)))

AICv <- sapply(mods_OLS, AIC)

stargazer(
  mods_OLS,
  type = "latex",
  title = "OLS regression model for \\texttt{jobsat}",
  dep.var.labels = "\\texttt{jobsat}",
  se = se_clus,
  covariate.labels = c(
    "Unemployment rate",
    "Labor force (thousands persons)",
    "ACS5 income (thousands USD)",
    "Rurality",
    "Valid jobsat tweets",
    "Unemployment rate $\\times$ Labor force (thousands persons)"
  ),
  omit = c("factor\\(year\\)", "factor\\(month\\)", "factor\\(State\\)"),
  no.space = TRUE,
  digits = 5,
  keep.stat = c("n", "rsq", "adj.rsq"),
  add.lines = list(
    c("Akaike Inf. Crit.", sprintf("%.2f", AICv)),
    c("Year \\& month FE", "Yes", "Yes", "Yes", "Yes"),
    c("State FE", "No", "No", "No", "Yes"),
    c("Clustered SE (county)", "Yes", "Yes", "Yes", "Yes")
  ),
  notes = c(
    "Heteroskedasticity-robust standard errors clustered at the county level (StateCounty)."
  ),
  notes.align = "l"
)

setDT(df)

## ---- reference values for continuous covariates ----
lf_ref    <- median(df$labor_force,        na.rm = TRUE)
acs5_ref  <- median(df$acs5,               na.rm = TRUE)
vt_ref    <- median(df$validtweets_jobsat, na.rm = TRUE)
pop_ref   <- median(df$population,         na.rm = TRUE)

years_use    <- sort(unique(df$year))
rural_levels <- sort(unique(df$rural))

## ---- unemployment grid ----
unemp_grid <- seq(
  from = 0,
  to   = max(df$unemployment_rate, na.rm = TRUE),
  length.out = 50
)

## ---- FE structure: all (year, month, State) combos with weights ----
fe_tbl <- df[, .N, by = .(year, month, State)]   # N = how many obs for weighting

## ---- base grid: unemployment × rural × year ----
grid_year <- as.data.table(expand.grid(
  unemployment_rate = unemp_grid,
  rural             = rural_levels,
  year              = years_use
))

## ---- expand grid by month & State structure (within each year) ----
pred_unemp_rural_year <- merge(
  grid_year,
  fe_tbl,
  by = "year",
  allow.cartesian = TRUE
)

## add continuous covariates (held at medians)
pred_unemp_rural_year[, `:=`(
  labor_force        = lf_ref,
  acs5               = acs5_ref,
  validtweets_jobsat = vt_ref,
  population         = pop_ref
)]

pred_unemp_rural_year[, p_hat :=
                        predict(mod4, newdata = pred_unemp_rural_year,
                                type = "response")]

avg_year_unemp_rural <- pred_unemp_rural_year[
  !is.na(p_hat),
  .(p_hat = stats::weighted.mean(p_hat, w = N)),
  by = .(year, unemployment_rate, rural)
]

avg_unemp_rural <- avg_year_unemp_rural[
  , .(p_hat = mean(p_hat)),
  by = .(unemployment_rate, rural)
]

unemp_mean_all <- mean(df$unemployment_rate, na.rm = TRUE)

## Delta P relative to rural = 1

rural_levels <- sort(unique(avg_unemp_rural$rural))

## ΔP_r(u) = P_r(u) - P_{r=1}(u) at each unemployment rate
avg_unemp_rural[, dP :=
                  p_hat - p_hat[rural == 1],
                by = unemployment_rate
]

## drop rural = 1 for plotting
avg_diff <- avg_unemp_rural[rural != 1]
avg_diff[, rural_f := factor(rural, levels = rural_levels[rural_levels != 1])]

## Plot Delta P vs unemployment rate

p_delta_avg <- ggplot(
  avg_diff,
  aes(x = unemployment_rate,
      y = dP,
      group = rural_f,
      colour = rural_f)
) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey50") +
  geom_line(linewidth = 0.9) +
  geom_vline(xintercept = unemp_mean_all,
             linetype = "dashed",
             colour = "red") +
  scale_color_viridis_d(name = "Rural Level", option = "D") +
  coord_cartesian(ylim = c(min(avg_diff$dP, na.rm = TRUE), 0)) +
  labs(
    title = "P(jobsat > 0 | rural=r) - P(jobsat > 0 | rural=1)",
    x = "Unemployment Rate (%)",
    y = expression(Delta*P)
  ) +
  theme_minimal(base_size = 13)

p_delta_avg   

ggsave(p_delta_avg,file="jobsat.pdf", width=9, height=4)


# plotting MAPs

setDT(df)

df_county <- df[,.(
    jobsat_mean = mean(jobsat, na.rm = TRUE),
    unemp_mean  = mean(unemployment_rate, na.rm = TRUE)),
  by = .(StateCounty)
]

df_county[, jobsat_rescaled := (jobsat_mean + 1) / 2]

setnames(df_county, "StateCounty", "fips")

p_jobsat <- plot_usmap(
  regions = "counties",
  data    = df_county,
  values  = "jobsat_rescaled"
) +
  scale_fill_viridis_c(
    name   = "Job satisfaction\n(rescaled 0–1)",
    option = "magma",      # red → orange → yellow → white
    direction = 1,
    limits = c(0, 1),
    breaks = c(0, 0.5, 1),
    labels = c("0 (low)", "0.5 (threshold)", "1 (high)")
  ) +
  labs(
    title    = "Average Job Satisfaction (2013–2023)",
    subtitle = "Indicator rescaled from [-1,1] to [0,1]; 0.5 corresponds to neutral sentiment"
  ) +
  theme(
    legend.position = "right",
    plot.title      = element_text(hjust = 0.5),
    plot.subtitle   = element_text(hjust = 0.5)
  )

p_jobsat

p_unemp <- plot_usmap(
  regions = "counties",
  data    = df_county,
  values  = "unemp_mean"
) +
  scale_fill_viridis_c(
    name   = "Mean unemployment rate (%)",
    option = "D",
    direction = -1
  ) +
  labs(
    title    = "Average Unemployment Rate (2013–2023)",
    subtitle = "County-level monthly unemployment averaged over time"
  ) +
  theme(
    legend.position = "right",
    plot.title      = element_text(hjust = 0.5),
    plot.subtitle   = element_text(hjust = 0.5)
  )

p_unemp


combined_maps <- p_jobsat / p_unemp + 
  plot_layout(heights = c(1, 1))

combined_maps

ggsave(combined_maps, file="jobsat_unemp_maps.pdf", width=9, height=12)

